Survey Package in R
Pusat Penyelidikan Penyakit Tak Berjangkit, Institut Kesihatan Umum
Sunday, 27 October 2024
sim_pop <- tibble(ethnicity = c(rep("Malay", 460),
rep("Chinese", 330),
rep("Indian", 250),
rep("Borneo", 10))) %>%
mutate(ethnicity = fct_relevel(ethnicity, "Malay", "Chinese", "Indian"))
sim_pop %>%
count(ethnicity) %>% mutate(pct = scales::label_percent()(n/1000))# A tibble: 4 × 3
ethnicity n pct
<fct> <int> <chr>
1 Malay 460 46%
2 Chinese 330 33%
3 Indian 250 25%
4 Borneo 10 1%
# A tibble: 4 × 3
ethnicity n pct
<fct> <int> <chr>
1 Malay 18 36%
2 Chinese 21 42%
3 Indian 10 20%
4 Borneo 1 2%
# A tibble: 4 × 3
ethnicity n pct
<fct> <int> <chr>
1 Malay 15 30%
2 Chinese 19 38%
3 Indian 15 30%
4 Borneo 1 2%
library(arrow)
pop_mydist <- read_parquet("https://storage.dosm.gov.my/population/population_district.parquet") %>%
filter(date == dmy("01/01/23"),
sex != "overall",
age %in% c("20-24", "24-29", "30-34", "35-39", "40-44",
"45-49", "50-54", "55-59"),
ethnicity %in% c("bumi_malay", "chinese", "indian")) %>%
rename(gender = sex) %>%
mutate(gender = fct_recode(gender,
"Male" = "male",
"Female" = "female"),
ethnicity = fct_recode(ethnicity,
"Malay" = "bumi_malay",
"Chinese" = "chinese",
"Indian" = "indian"),
population = population * 1000)# A tibble: 10,080 × 7
state district date gender age ethnicity population
<chr> <chr> <date> <fct> <chr> <fct> <dbl>
1 Johor Batu Pahat 2023-01-01 both 20-24 Malay 30600
2 Johor Batu Pahat 2023-01-01 both 30-34 Malay 26400
3 Johor Batu Pahat 2023-01-01 both 35-39 Malay 26300
4 Johor Batu Pahat 2023-01-01 both 40-44 Malay 23300
5 Johor Batu Pahat 2023-01-01 both 45-49 Malay 18000
6 Johor Batu Pahat 2023-01-01 both 50-54 Malay 16900
7 Johor Batu Pahat 2023-01-01 both 55-59 Malay 15400
8 Johor Batu Pahat 2023-01-01 Male 20-24 Malay 15600
9 Johor Batu Pahat 2023-01-01 Male 30-34 Malay 13400
10 Johor Batu Pahat 2023-01-01 Male 35-39 Malay 13300
# ℹ 10,070 more rows
set.seed(121)
selected_district <- pop_mydist %>%
distinct(state, district) %>%
mutate(zone = case_when(state %in% c("Johor", "Melaka", "Negeri Sembilan") ~ "Selatan",
state %in% c("Kedah", "Perak", "Perlis", "Pulau Pinang") ~ "Utara",
state %in% c("Kelantan", "Pahang", "Terengganu") ~ "Timur",
state %in% c("Selangor", "W.P. Kuala Lumpur", "W.P. Putrajaya") ~ "Tengah",
state %in% c("Sabah", "Sarawak", "W.P. Labuan") ~ "Borneo")) %>%
group_by(zone) %>%
slice_sample(n = 2) %>%
ungroup() %>%
relocate(zone, .before = 1)
selected_district# A tibble: 10 × 3
zone state district
<chr> <chr> <chr>
1 Borneo Sarawak Pusa
2 Borneo Sarawak Asajaya
3 Selatan Melaka Jasin
4 Selatan Johor Muar
5 Tengah Selangor Kuala Selangor
6 Tengah Selangor Ulu Selangor
7 Timur Terengganu Kuala Nerus
8 Timur Kelantan Bachok
9 Utara Pulau Pinang Barat Daya
10 Utara Perak Bagan Datuk
#| eval: false
library(simstudy)
def <- defData(varname = "gender", dist = "binary", formula = 0.5,
link = "identity") %>%
defData(varname = "age", dist = "uniform", formula = "20;59") %>%
defData(varname = "ethnicity", dist = "categorical",
formula = "0.57;0.29;0.14") %>%
defData(varname = "BMI", dist = "normal", formula = 26, variance = 2.6^2) %>%
defData(varname = "height", dist = "normal", formula = 165, variance = 5) %>%
defData(varname = "PAhour", dist = "uniform", formula = "2;6") %>%
defData(varname = "hba1c", dist = "normal", variance = 1.4^2,
formula = "2.4 + 0.05 * age + 0.1 * BMI - 0.15 * PAhour")#| eval: false
set.seed(121)
simnhmsds0 <- genData(400, def) %>%
mutate(gender = case_when(gender == 0 ~ "Male",
gender == 1 ~ "Female"),
agegp = cut(age,
breaks = c(19, 29, 39, 49, 59),
labels = c("20-29", "30-39", "40-49", "50-59")),
ethnicity = case_when(ethnicity == 1 ~ "Malay",
ethnicity == 2 ~ "Chinese",
ethnicity == 3 ~ "Indian"),
weight = BMI * (height/100)^2,
across(.cols = c(hba1c, weight),
.fns = ~ round(., 1)),
across(.cols = c(height),
.fns = ~ round(., 2)),
district = rep(1:10, each = 40),
dm_dx = cut(hba1c,
breaks = c(0, 6.49, 10),
labels = c("0", "1")),
dm_dx = as.numeric(dm_dx) - 1,
across(.cols = c(district, age, height, PAhour, dm_dx),
.fns = ~ as.integer(.))) %>%
relocate(weight, .after = height) %>%
select(id, district, everything(), -BMI) %>%
right_join(selected_district %>% mutate(n = 1:10), .,
by = c("n" = "district")) %>%
select(id, zone:district, gender:age, agegp, everything(), -n)# A tibble: 15 × 13
id zone state district gender age agegp ethnicity height weight PAhour
<int> <chr> <chr> <chr> <chr> <int> <fct> <chr> <int> <dbl> <int>
1 1 Borneo Sara… Pusa Male 28 20-29 Chinese 163 56.7 2
2 2 Borneo Sara… Pusa Female 53 50-59 Indian 164 63.2 5
3 3 Borneo Sara… Pusa Female 53 50-59 Chinese 164 71.1 3
4 4 Borneo Sara… Pusa Female 39 40-49 Malay 164 76.7 2
5 5 Borneo Sara… Pusa Female 32 30-39 Chinese 161 69.8 3
6 6 Borneo Sara… Pusa Male 28 20-29 Malay 168 71.5 5
7 7 Borneo Sara… Pusa Male 28 20-29 Chinese 164 63.9 3
8 8 Borneo Sara… Pusa Female 51 50-59 Indian 167 66.9 4
9 9 Borneo Sara… Pusa Male 23 20-29 Chinese 164 64.4 2
10 10 Borneo Sara… Pusa Male 23 20-29 Malay 166 79.3 3
11 11 Borneo Sara… Pusa Female 47 40-49 Malay 164 73.3 5
12 12 Borneo Sara… Pusa Female 37 30-39 Malay 170 74.8 3
13 13 Borneo Sara… Pusa Female 30 30-39 Malay 163 66.9 5
14 14 Borneo Sara… Pusa Male 37 30-39 Chinese 164 69.5 4
15 15 Borneo Sara… Pusa Male 30 30-39 Indian 163 73.9 5
# ℹ 2 more variables: hba1c <dbl>, dm_dx <int>
sample_sizes <- list("Utara" = 36, "Selatan" = 34,
"Timur" = 38, "Tengah" = 32, "Borneo" = 38)
simnhmsds_split <- simnhmsds0 %>%
group_split(zone, .keep = TRUE)
set.seed(121)
simnhmsds_final <- map(simnhmsds_split, function(data) {
zone <- unique(data$zone)
data %>%
group_by(district) %>%
slice_sample(n = sample_sizes[[zone]]) %>%
ungroup()
}) %>%
bind_rows() %>%
arrange(id) %>%
mutate(success = 1,
.before = 1)design_weight <- pop_mydist %>%
distinct(state, district) %>%
mutate(zone = case_when(state %in% c("Johor", "Melaka", "Negeri Sembilan") ~ "Selatan",
state %in% c("Kedah", "Perak", "Perlis", "Pulau Pinang") ~ "Utara",
state %in% c("Kelantan", "Pahang", "Terengganu") ~ "Timur",
state %in% c("Selangor", "W.P. Kuala Lumpur", "W.P. Putrajaya") ~ "Tengah",
state %in% c("Sabah", "Sarawak", "W.P. Labuan") ~ "Borneo")) %>%
count(zone) %>%
rename(total_district = n) %>%
mutate(selected_district = 2,
f1 = selected_district / total_district,
W1 = 1/f1)
design_weight# A tibble: 5 × 5
zone total_district selected_district f1 W1
<chr> <int> <dbl> <dbl> <dbl>
1 Borneo 68 2 0.0294 34
2 Selatan 20 2 0.1 10
3 Tengah 11 2 0.182 5.5
4 Timur 30 2 0.0667 15
5 Utara 31 2 0.0645 15.5
# A tibble: 10 × 5
zone state district fnr Fw
<chr> <chr> <chr> <dbl> <dbl>
1 Borneo Sarawak Pusa 0.95 1.05
2 Borneo Sarawak Asajaya 0.95 1.05
3 Selatan Melaka Jasin 0.85 1.18
4 Selatan Johor Muar 0.85 1.18
5 Tengah Selangor Kuala Selangor 0.8 1.25
6 Tengah Selangor Ulu Selangor 0.8 1.25
7 Timur Terengganu Kuala Nerus 0.95 1.05
8 Timur Kelantan Bachok 0.95 1.05
9 Utara Pulau Pinang Barat Daya 0.9 1.11
10 Utara Perak Bagan Datuk 0.9 1.11
district_adw <- nonresponse_weight %>%
left_join(.,
district_ws %>%
select(district, district_w1),
by = "district") %>%
mutate(district_adw = district_w1 * Fw)
district_adw# A tibble: 10 × 7
zone state district fnr Fw district_w1 district_adw
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Borneo Sarawak Pusa 0.95 1.05 1292 1360
2 Borneo Sarawak Asajaya 0.95 1.05 1292 1360
3 Selatan Melaka Jasin 0.85 1.18 340 400
4 Selatan Johor Muar 0.85 1.18 340 400
5 Tengah Selangor Kuala Selangor 0.8 1.25 176 220
6 Tengah Selangor Ulu Selangor 0.8 1.25 176 220
7 Timur Terengganu Kuala Nerus 0.95 1.05 570 600
8 Timur Kelantan Bachok 0.95 1.05 570 600
9 Utara Pulau Pinang Barat Daya 0.9 1.11 558 620
10 Utara Perak Bagan Datuk 0.9 1.11 558 620
ps_pop <- pop_mydist %>%
mutate(agegp = case_when(age %in% c("20-24", "24-29") ~ "20-29",
age %in% c("30-34", "35-39") ~ "30-39",
age %in% c("40-44", "45-49") ~ "40-49",
age %in% c("50-54", "55-59") ~ "50-59")) %>%
group_by(gender, agegp, ethnicity) %>%
summarise(population = sum(population),
.groups = "drop")
ps_pop %>%
mutate(popcoma = scales::label_comma()(population))# A tibble: 36 × 5
gender agegp ethnicity population popcoma
<fct> <chr> <fct> <dbl> <chr>
1 both 20-29 Malay 1607000 1,607,000
2 both 20-29 Chinese 502700 502,700
3 both 20-29 Indian 160800 160,800
4 both 30-39 Malay 2888300 2,888,300
5 both 30-39 Chinese 1059700 1,059,700
6 both 30-39 Indian 336000 336,000
7 both 40-49 Malay 2240100 2,240,100
8 both 40-49 Chinese 1058300 1,058,300
9 both 40-49 Indian 300800 300,800
10 both 50-59 Malay 1583300 1,583,300
# ℹ 26 more rows
ps_weight <- simnhmsds_final %>%
left_join(.,
district_adw %>%
select(district, district_adw),
by = join_by(district)) %>%
as_survey_design(id = 1,
weights = district_adw) %>%
group_by(gender, agegp, ethnicity) %>%
summarise(ps_adw = survey_total(success),
.groups = "drop") %>%
select(-ps_adw_se) %>%
left_join(simnhmsds_final %>%
count(gender, agegp, ethnicity),
.,
by = join_by(gender, agegp, ethnicity)) %>%
left_join(.,
ps_pop,
by = join_by(gender, agegp, ethnicity)) %>%
mutate(fps = population / ps_adw,
final_weight = 1/fps)# A tibble: 24 × 8
gender agegp ethnicity n ps_adw population fps final_weight
<chr> <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
1 Female 20-29 Chinese 15 8220 241200 29.3 0.0341
2 Female 20-29 Indian 6 3380 76800 22.7 0.0440
3 Female 20-29 Malay 21 14380 777700 54.1 0.0185
4 Female 30-39 Chinese 16 10420 529300 50.8 0.0197
5 Female 30-39 Indian 6 4560 166100 36.4 0.0275
6 Female 30-39 Malay 19 15420 1445300 93.7 0.0107
7 Female 40-49 Chinese 4 1840 515800 280. 0.00357
8 Female 40-49 Indian 8 3820 146700 38.4 0.0260
9 Female 40-49 Malay 33 21300 1101900 51.7 0.0193
10 Female 50-59 Chinese 14 8020 428200 53.4 0.0187
# ℹ 14 more rows
simnhmsds_weight <- simnhmsds_final %>%
left_join(.,
ps_weight %>%
select(gender, agegp, ethnicity, final_weight),
by = join_by(gender, agegp, ethnicity))
simnhmsds_weight# A tibble: 356 × 15
success id zone state district gender age agegp ethnicity height weight
<dbl> <int> <chr> <chr> <chr> <chr> <int> <chr> <chr> <int> <dbl>
1 1 1 Born… Sara… Pusa Male 28 20-29 Chinese 163 56.7
2 1 2 Born… Sara… Pusa Female 53 50-59 Indian 164 63.2
3 1 3 Born… Sara… Pusa Female 53 50-59 Chinese 164 71.1
4 1 4 Born… Sara… Pusa Female 39 40-49 Malay 164 76.7
5 1 6 Born… Sara… Pusa Male 28 20-29 Malay 168 71.5
6 1 7 Born… Sara… Pusa Male 28 20-29 Chinese 164 63.9
7 1 8 Born… Sara… Pusa Female 51 50-59 Indian 167 66.9
8 1 9 Born… Sara… Pusa Male 23 20-29 Chinese 164 64.4
9 1 10 Born… Sara… Pusa Male 23 20-29 Malay 166 79.3
10 1 11 Born… Sara… Pusa Female 47 40-49 Malay 164 73.3
# ℹ 346 more rows
# ℹ 4 more variables: PAhour <int>, hba1c <dbl>, dm_dx <int>,
# final_weight <dbl>
svydesign() function from the survey package is used to define the complex survey design.ids: cluster id. ~1 if no clusterprobs or weights: sampling probability or weight, use only onestrata: strata id. NULL (or leave unspecified) if no stratadata: datasetIndependent Sampling design (with replacement)
svydesign(ids = ~1, weights = 1, data = simnhmsds_weight)
Probabilities:
Min. 1st Qu. Median Mean 3rd Qu. Max.
1 1 1 1 1 1
Data variables:
[1] "success" "id" "zone" "state" "district"
[6] "gender" "age" "agegp" "ethnicity" "height"
[11] "weight" "PAhour" "hba1c" "dm_dx" "final_weight"
wtds_dsg <- svydesign(ids = ~district,
weights = ~final_weight,
strata = ~zone,
data = simnhmsds_weight)
summary(wtds_dsg)Stratified 1 - level Cluster Sampling design (with replacement)
With (10) clusters.
svydesign(ids = ~district, weights = ~final_weight, strata = ~zone,
data = simnhmsds_weight)
Probabilities:
Min. 1st Qu. Median Mean 3rd Qu. Max.
13.48 35.83 52.25 54.56 65.99 280.33
Stratum Sizes:
Borneo Selatan Tengah Timur Utara
obs 76 68 64 76 72
design.PSU 2 2 2 2 2
actual.PSU 2 2 2 2 2
Data variables:
[1] "success" "id" "zone" "state" "district"
[6] "gender" "age" "agegp" "ethnicity" "height"
[11] "weight" "PAhour" "hba1c" "dm_dx" "final_weight"
svyby( ) functionComplex Sampling Design in NHMS